I chose @aliazaita and @LaniPliopa as the two YouTube channels to explore, as they are two vloggers I subscribe to, have a similar number of subscribers (465K and 395K, respectively), and are both lifestyle vloggers.
Two ideas that I had before I accessed the data for visualisations:
Process of creating my data visualisations:
I first tried geom_point + geom_smooth to
find a relationship between the duration time and views count, but I did
not find any clear relationship. On the title slide, I chose 4
screenshots from the alia zaita and Lani Pliopa channels and composited
them together using
image_append(). For data
visualisations, I used several different geoms, such as
geom_boxplot(), geom_bar(), geom_density()
and geom_jitter(). On the final slide, I also added a meme to
make the story more interesting.
From Module 4: Creating static and dynamic visualisations, I
learned how to use geom_layer(s), the aes()
function, scale_layers(), facet_wrap(), labs() and guides(). For example,
geom_layer(s) visualise the data points by using
different shapes; aes() maps variables from your data to
axes, positions, colours, size, shape, etc.; facet_wrap()
creates subplots for your visualisation, also referred to as “small
multiples”; scale_layers() adjusts the scales used to
visualise the data, including colour and other “visual” scales;
labs() adds labels to your visualisation; and
guides() modifies legends (guides), e.g. changing their position or
hiding them.
In the future, I want to learn more about how to analyse data using packages and functions such as {ggplot2} in R.
library(magick)
library(tidyverse)
# Shared colour palette used by the plots below
my_colors <- c("#E59BE9", "#FFCDEA", "#FB9AD1", "#86469C")

# Load the video data from the published Google Sheets CSV export
csv_file <- "https://docs.google.com/spreadsheets/d/e/2PACX-1vQbRa6I6QQnUvKHE0RC8wnaPpUwPacwbtb6VqTJwGEQ1Sf3iW8Vbl3Yii--w43MAztzTHnVcEQf0Srz/pub?output=csv"
youtube_data <- read_csv(file = csv_file)
# Goal: explore what can influence the number of views.
# First, look at the relationship between video duration and view count.
#
# Derived columns added to the raw data:
#   duration_minutes   - `duration` / 60, rounded to 2 decimals
#                        (assumes `duration` is in seconds - TODO confirm)
#   views_in_thousands - `viewCount` / 1000, rounded to 2 decimals
#   views_amount_level - text bucket of views_in_thousands (see thresholds below)
duration_views_data <- youtube_data %>%
  mutate(
    duration_minutes = round(duration / 60, 2),
    views_in_thousands = round(viewCount / 1000, 2),
    views_amount_level = case_when(
      views_in_thousands <= 10 ~ "low views",
      views_in_thousands <= 100 ~ "Moderate views",
      views_in_thousands <= 1000 ~ "High views",
      # Open-ended top bucket: with an upper bound of 10000, any video above
      # 10 million views matched no branch and was silently labelled NA.
      # A missing view count still yields NA because the comparison is NA.
      views_in_thousands > 1000 ~ "Very high views"
    )
  )
# Jittered points plus box plots of video duration for each view level,
# one panel per channel. The plot is printed and then saved as plot3.png.
duration_views_data %>%
  ggplot(aes(
    x = duration_minutes,
    # View levels are ordered on the axis by their mean duration
    y = reorder(views_amount_level, duration_minutes),
    colour = views_amount_level
  )) +
  geom_jitter(height = 0.2) +
  # Every video is already drawn by geom_jitter(), so hide the box plot's own
  # outlier markers; otherwise outlying videos appear twice.
  geom_boxplot(fill = NA, outlier.shape = NA) +
  facet_wrap(vars(channelName)) +
  scale_color_manual(values = my_colors) +
  labs(
    title = "What is the duration of a highly viewed video?",
    x = "duration minutes",
    y = "The level of views amount",
    caption = "Source: YouTube API"
  ) +
  theme_minimal() +
  # "none" replaces the deprecated `FALSE` for switching a legend off
  guides(color = "none") +
  theme(
    plot.title = element_text(size = 20, face = "bold")
  )

# Saves the most recently displayed plot (the one above)
ggsave("plot3.png")
# Mean video duration (in minutes) for every channel / view-level combination,
# computed once and then split into one table per channel.
mean_duration_by_channel_level <- duration_views_data %>%
  group_by(channelName, views_amount_level) %>%
  summarise(mean_duration_minutes = mean(duration_minutes), .groups = "drop")

# Author's observation: high-views videos on @aliazaita average about 14 min.
specific_duration_views_data_1 <- mean_duration_by_channel_level %>%
  filter(channelName == "@aliazaita") %>%
  select(-channelName)

# Author's observation: high-views videos on @LaniPliopa average about 10 min.
specific_duration_views_data_2 <- mean_duration_by_channel_level %>%
  filter(channelName == "@LaniPliopa") %>%
  select(-channelName)
# Next: compare how video durations are distributed within each channel.
duration_data <- select(duration_views_data, duration_minutes, channelName)

# Third and fourth palette colours, one per channel
channel_colour <- my_colors[3:4]

# Density of video duration, one panel per channel; printed, then saved below
ggplot(duration_data, aes(x = duration_minutes, fill = channelName)) +
  geom_density() +
  facet_wrap(vars(channelName)) +
  scale_fill_manual(values = channel_colour) +
  labs(
    x = "duration minutes",
    title = "How long is typical video durations of two channels?",
    caption = "Source: YouTube API"
  ) +
  theme_minimal() +
  theme(plot.title = element_text(size = 20, face = "bold"))

# Saves the most recently displayed plot (the density plot above)
ggsave("plot4.png")
# Share of each channel's videos that reached at least 100 thousand views.
# The denominator is the channel's actual number of rows rather than a
# hard-coded 100, so the ratio stays correct if a channel contributes a
# different number of videos. A channel with no such videos now appears with
# a ratio of 0 instead of being dropped from the table.
# The column name (including its spelling) is kept so existing references
# continue to work.
more_100_thousands_views <- duration_views_data %>%
  group_by(channelName) %>%
  summarise(
    ratio_of_vedieos = sum(views_in_thousands >= 100, na.rm = TRUE) / n()
  )
# Second, explore the relationship between the hour of day a video was
# published and its view count.
#
# Result: one row per channel and publishing hour with the mean views
# (in thousands) of the videos published in that hour.
time_views_data <- duration_views_data %>%
  mutate(
    # Characters 12-13 of datePublished are parsed as the hour
    # (presumably an ISO 8601 style timestamp - verify against the data)
    published_time = datePublished %>%
      str_sub(12, 13) %>%
      parse_number()
  ) %>%
  select(channelName, published_time, views_in_thousands) %>%
  group_by(channelName, published_time) %>%
  # Drop the grouping explicitly: otherwise the result stays grouped by
  # channelName and summarise() prints a regrouping message.
  summarise(
    mean_views_in_thousands = mean(views_in_thousands),
    .groups = "drop"
  )
# Third and fourth palette colours, one per channel
channel_colour <- my_colors[3:4]

# Side-by-side bars of mean views per publishing hour for the two channels;
# printed, then saved as plot2.png
ggplot(
  time_views_data,
  aes(x = published_time, y = mean_views_in_thousands, fill = channelName)
) +
  geom_col(position = "dodge") +
  scale_fill_manual(values = channel_colour) +
  labs(
    title = "Average views at different times of day of two channels",
    x = "Published Time (hour of day)",
    y = "Mean Views (in thousands)"
  ) +
  theme_minimal() +
  theme(plot.title = element_text(size = 20, face = "bold"))

# Saves the most recently displayed plot (the bar chart above)
ggsave("plot2.png")
# When published between 1pm and 4pm, videos on the @LaniPliopa channel got more views than at other times.
# When published between 1pm and 5pm, or around 8pm, videos on the @aliazaita channel got more views than at other times.
# Third, find the ten most common meaningful title words for each channel.

# Filler words that are excluded from the ranking
not_useful_words <- c(
  "in", "i", "a", "my", "to", "the", "for", "with", "me", "days", "this",
  "day", "what", "as", "good", "new", "and", "of"
)

# One row per channel and word with its frequency `n`, limited to each
# channel's ten most frequent words (ties at the cut-off are all kept).
top_ten_common_words_in_title <- duration_views_data %>%
  select(title, channelName) %>%
  # Split every title into one row per space-separated token
  separate_rows(title, sep = " ") %>%
  mutate(
    # Lower-case, then strip everything that is not a letter. Removing
    # non-alphanumerics already removes punctuation, so no separate
    # punctuation pass is needed.
    clean_word = title %>%
      str_to_lower() %>%
      str_remove_all("[^[:alnum:]]") %>%
      str_remove_all("[[:digit:]]")
  ) %>%
  filter(clean_word != "", !clean_word %in% not_useful_words) %>%
  count(channelName, clean_word) %>%
  arrange(channelName, desc(n)) %>%
  group_by(channelName) %>%
  # slice_max() replaces the superseded top_n(); the ranking column is now
  # stated explicitly instead of being guessed from the last column.
  slice_max(order_by = n, n = 10, with_ties = TRUE) %>%
  ungroup()
# Third and fourth palette colours, one per channel
channel_colour <- my_colors[3:4]

# Horizontal dodged bars of word frequency per channel; printed, then saved
# as plot1.png
ggplot(
  top_ten_common_words_in_title,
  aes(x = n, y = clean_word, fill = channelName)
) +
  geom_col(position = "dodge") +
  scale_fill_manual(values = channel_colour) +
  labs(
    title = "Top 10 Common Words in Titles by Two Channel",
    x = "The total number of words in titles",
    y = "Word",
    caption = "Source: YouTube API"
  ) +
  theme_minimal() +
  theme(
    plot.title = element_text(size = 20, face = "bold"),
    axis.text.y = element_text(size = 10)
  )

# Saves the most recently displayed plot (the bar chart above)
ggsave("plot1.png")
library(magick)
library(tidyverse)
# Palette for the slides; the fifth colour is the slide background
my_colors <- c("#E59BE9", "#FFCDEA", "#FB9AD1", "#86469C", "#EEF7FF")

# Two blank 600x400 canvases in the background colour:
# text_square holds a slide's text, image_box holds its picture.
text_square <- image_blank(color = my_colors[5], width = 600, height = 400)
image_box <- image_blank(color = my_colors[5], width = 600, height = 400)
# Frame 1 - title slide: a 2x2 grid of channel screenshots on the left and
# the introduction text on the right.
text_1 <- str_wrap("An exploration of the data of the two Youtube channels, @aliazaita and @LaniPliopa,
both are as life vloggers and have similar subscribers counts. We will focus on their difference on
view counts, published times, video durations, and content topics of the videos.", 50)
text_part_1 <- image_annotate(
  text_square,
  text = text_1,
  size = 20,
  gravity = "center"
)

# Four screenshots, each scaled with geometry 300
aliazaita_image_1 <- image_scale(image_read(path = "aliazaita_1.png"), 300)
laniPliopa_image_1 <- image_scale(image_read(path = "LaniPliopa_1.png"), 300)
aliazaita_image_2 <- image_scale(image_read(path = "aliazaita_2.png"), 300)
laniPliopa_image_2 <- image_scale(image_read(path = "LaniPliopa_2.png"), 300)

# Each row puts the two channels side by side; the rows are then stacked
row1 <- image_append(c(aliazaita_image_1, laniPliopa_image_1))
row2 <- image_append(c(aliazaita_image_2, laniPliopa_image_2))
pictures_1 <- image_scale(image_append(c(row1, row2), stack = TRUE), 600)

# Place the grid on the picture canvas and join it with the text canvas
image_part_1 <- image_composite(image_box, pictures_1, offset = "+10+28")
frame1 <- image_append(c(image_part_1, text_part_1))
frame1
# Slide 2 - common title words: plot1.png on the left, commentary on the right
text_2 <- str_wrap("Exploring the titles of each channel revealed some overlaps in
the top 10 most frequent words. Some words appeared frequently in both two channels,
such as week, vlog, life.This shared vocabulary implied there were some similar contents or themes
in the two channels. The other words can implied some other contents or elements appeared frequently.", 48)
text_part_2 <- image_annotate(
  text_square,
  text = text_2,
  size = 20,
  gravity = "center"
)

# Saved word-frequency plot, scaled with geometry 500
plot_1 <- image_scale(image_read("plot1.png"), 500)

# Place the plot on the picture canvas and join it with the text canvas
image_part_2 <- image_composite(image_box, plot_1, offset = "+60+28")
frame2 <- image_append(c(image_part_2, text_part_2))
frame2
# Slide 3 - publishing hour vs. views: plot2.png on the left, commentary on
# the right
text_3 <- str_wrap("After exploring the relationship between published time in a day and views count,
we found the amount mean views are high when published time was from 1pm to 4pm for the two channels.
However, @aliazaita also can get very high views when published time was around 8pm. In addition, most of
videos in two channels was published from 12am to 10pm.", 50)
text_part_3 <- image_annotate(
  text_square,
  text = text_3,
  size = 20,
  gravity = "center"
)

# Saved publishing-hour plot, scaled with geometry 500
plot_2 <- image_scale(image_read("plot2.png"), 500)

# Place the plot on the picture canvas and join it with the text canvas
image_part_3 <- image_composite(image_box, plot_2, offset = "+80+28")
frame3 <- image_append(c(image_part_3, text_part_3))
frame3
# Slide 4 - duration distribution: plot4.png on the left, commentary on the
# right
text_4 <- str_wrap("The duration time of most of videos form @aliazaita is below 1 min or around 15 mins.
And the duration time of most of videos form @LaniPliopa is around 10 mins.", 50)
text_part_4 <- image_annotate(
  text_square,
  text = text_4,
  size = 20,
  gravity = "center"
)

# Saved density plot, scaled with geometry 500
plot_4 <- image_scale(image_read("plot4.png"), 500)

# Place the plot on the picture canvas and join it with the text canvas
image_part_4 <- image_composite(image_box, plot_4, offset = "+80+28")
frame4 <- image_append(c(image_part_4, text_part_4))
frame4
# Slide 5 - duration by view level: plot3.png on the left, commentary on the
# right
text_5 <- str_wrap("There are more high views video from @aliazaita channel than @LaniPliopa.The
median of durations of videos which got high or very high views is around 16 minutes in the
@aliazaita channel.The median of durations of videos which got high or very high views is around
10 minutes in the @LaniPliopa channel.", 50)
text_part_5 <- image_annotate(
  text_square,
  text = text_5,
  size = 20,
  gravity = "center"
)

# Saved box plot, scaled with geometry 480
plot_3 <- image_scale(image_read("plot3.png"), 480)

# Place the plot on the picture canvas and join it with the text canvas
image_part_5 <- image_composite(image_box, plot_3, offset = "+80+28")
frame5 <- image_append(c(image_part_5, text_part_5))
frame5
# Final slide - overall summary: img_6.png on the left, conclusion text
# (smaller font, size 17) on the right
text_6 <- str_wrap("Overall, I learned that the some differences and similarities between
the YouTube channels of @aliazaita and @LaniPliopa.There are some overlaps in the vocabularies used in titles,
indicating shared content themes such as life, vlog, and week. Both channels experience peak views
during certain hours of the day, with @aliazaita demonstrating the ability to attract high views even during non-peak hours.
The analysis reveals a disparity in high views videos between the two channels, with @aliazaita consistently
outperforming @LaniPliopa in this regard. @Aliazaita tends to produce longer videos on average compared to @LaniPliopa,
with median durations of 16 minutes and 10 minutes, respectively, for high views videos.", 50)
text_part_6 <- image_annotate(
  text_square,
  text = text_6,
  size = 17,
  gravity = "center"
)

# Closing picture, scaled with geometry 400
img_6 <- image_scale(image_read("img_6.png"), 400)

# Place the picture on its canvas and join it with the text canvas
image_part_6 <- image_composite(image_box, img_6, offset = "+150+8")
frame6 <- image_append(c(image_part_6, text_part_6))
frame6
# Build the animated data story: each slide is repeated 8 times and the GIF
# plays at 1 frame per second, so every slide stays on screen for 8 seconds.
frames <- c(
  rep(frame1, 8),
  rep(frame2, 8),
  rep(frame3, 8),
  rep(frame4, 8),
  rep(frame5, 8),
  rep(frame6, 8)
)
data_story <- image_animate(frames, fps = 1)

# Write the animation to disk
image_write(data_story, "data_story.gif")